import plotly
plotly.offline.init_notebook_mode()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
data_diabetes = datasets.load_diabetes()
# Converting into dataframe
df_diabetes = pd.DataFrame(data_diabetes.data,columns=data_diabetes.feature_names)
df_diabetes['y'] = data_diabetes.target
df_diabetes.head()
| age | sex | bmi | bp | s1 | s2 | s3 | s4 | s5 | s6 | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.038076 | 0.050680 | 0.061696 | 0.021872 | -0.044223 | -0.034821 | -0.043401 | -0.002592 | 0.019907 | -0.017646 | 151.0 |
| 1 | -0.001882 | -0.044642 | -0.051474 | -0.026328 | -0.008449 | -0.019163 | 0.074412 | -0.039493 | -0.068332 | -0.092204 | 75.0 |
| 2 | 0.085299 | 0.050680 | 0.044451 | -0.005670 | -0.045599 | -0.034194 | -0.032356 | -0.002592 | 0.002861 | -0.025930 | 141.0 |
| 3 | -0.089063 | -0.044642 | -0.011595 | -0.036656 | 0.012191 | 0.024991 | -0.036038 | 0.034309 | 0.022688 | -0.009362 | 206.0 |
| 4 | 0.005383 | -0.044642 | -0.036385 | 0.021872 | 0.003935 | 0.015596 | 0.008142 | -0.002592 | -0.031988 | -0.046641 | 135.0 |
df_describe = df_diabetes.describe()
df_describe
| age | sex | bmi | bp | s1 | s2 | s3 | s4 | s5 | s6 | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 442.000000 |
| mean | -2.511817e-19 | 1.230790e-17 | -2.245564e-16 | -4.797570e-17 | -1.381499e-17 | 3.918434e-17 | -5.777179e-18 | -9.042540e-18 | 9.293722e-17 | 1.130318e-17 | 152.133484 |
| std | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 77.093005 |
| min | -1.072256e-01 | -4.464164e-02 | -9.027530e-02 | -1.123988e-01 | -1.267807e-01 | -1.156131e-01 | -1.023071e-01 | -7.639450e-02 | -1.260971e-01 | -1.377672e-01 | 25.000000 |
| 25% | -3.729927e-02 | -4.464164e-02 | -3.422907e-02 | -3.665608e-02 | -3.424784e-02 | -3.035840e-02 | -3.511716e-02 | -3.949338e-02 | -3.324559e-02 | -3.317903e-02 | 87.000000 |
| 50% | 5.383060e-03 | -4.464164e-02 | -7.283766e-03 | -5.670422e-03 | -4.320866e-03 | -3.819065e-03 | -6.584468e-03 | -2.592262e-03 | -1.947171e-03 | -1.077698e-03 | 140.500000 |
| 75% | 3.807591e-02 | 5.068012e-02 | 3.124802e-02 | 3.564379e-02 | 2.835801e-02 | 2.984439e-02 | 2.931150e-02 | 3.430886e-02 | 3.243232e-02 | 2.791705e-02 | 211.500000 |
| max | 1.107267e-01 | 5.068012e-02 | 1.705552e-01 | 1.320436e-01 | 1.539137e-01 | 1.987880e-01 | 1.811791e-01 | 1.852344e-01 | 1.335973e-01 | 1.356118e-01 | 346.000000 |
df_diabetes_corr = df_diabetes.corr()
print(df_diabetes_corr)
plt.figure(figsize=(12,10))
sns.heatmap(df_diabetes_corr, annot=True)
plt.title('Heatmap of Correlation Matrix of Diabetes Dataset')
plt.show()
age sex bmi bp s1 s2 s3 \
age 1.000000 0.173737 0.185085 0.335428 0.260061 0.219243 -0.075181
sex 0.173737 1.000000 0.088161 0.241010 0.035277 0.142637 -0.379090
bmi 0.185085 0.088161 1.000000 0.395411 0.249777 0.261170 -0.366811
bp 0.335428 0.241010 0.395411 1.000000 0.242464 0.185548 -0.178762
s1 0.260061 0.035277 0.249777 0.242464 1.000000 0.896663 0.051519
s2 0.219243 0.142637 0.261170 0.185548 0.896663 1.000000 -0.196455
s3 -0.075181 -0.379090 -0.366811 -0.178762 0.051519 -0.196455 1.000000
s4 0.203841 0.332115 0.413807 0.257650 0.542207 0.659817 -0.738493
s5 0.270774 0.149916 0.446157 0.393480 0.515503 0.318357 -0.398577
s6 0.301731 0.208133 0.388680 0.390430 0.325717 0.290600 -0.273697
y 0.187889 0.043062 0.586450 0.441482 0.212022 0.174054 -0.394789
s4 s5 s6 y
age 0.203841 0.270774 0.301731 0.187889
sex 0.332115 0.149916 0.208133 0.043062
bmi 0.413807 0.446157 0.388680 0.586450
bp 0.257650 0.393480 0.390430 0.441482
s1 0.542207 0.515503 0.325717 0.212022
s2 0.659817 0.318357 0.290600 0.174054
s3 -0.738493 -0.398577 -0.273697 -0.394789
s4 1.000000 0.617859 0.417212 0.430453
s5 0.617859 1.000000 0.464669 0.565883
s6 0.417212 0.464669 1.000000 0.382483
y 0.430453 0.565883 0.382483 1.000000
# Check for missing values
df_missing = df_diabetes.isnull().sum()
df_missing
age 0 sex 0 bmi 0 bp 0 s1 0 s2 0 s3 0 s4 0 s5 0 s6 0 y 0 dtype: int64
from sklearn.model_selection import train_test_split
X = df_diabetes[['bmi','s5']]
y= df_diabetes['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error,r2_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
# models
models = {
# Polynomial regression models
'poly_2':make_pipeline(PolynomialFeatures(2), LinearRegression()),
'poly_3':make_pipeline(PolynomialFeatures(3), LinearRegression()),
# Decision tree models
'tree_3':tree.DecisionTreeRegressor(max_depth=3),
'tree_5':tree.DecisionTreeRegressor(max_depth=5),
# KNN models
'knn_3':KNeighborsRegressor(n_neighbors=3),
'knn_5':KNeighborsRegressor(n_neighbors=5),
}
# Using croos validation to evaluate models
# using dictionary to store the scores(multiple metrics)
scoring = {'R2': 'r2',
'MAE': make_scorer(mean_absolute_error),
'MAPE': make_scorer(mean_absolute_percentage_error)
}
# Loop through the models and get the scores
result_scores = {}
for name, model in models.items():
scores = cross_validate(model, X_train, y_train, cv=5, scoring=scoring)
result_scores[name] = scores
result_scores
{'poly_2': {'fit_time': array([0.00200057, 0.00300074, 0.00299859, 0.00200582, 0.00200057]),
'score_time': array([0.00099945, 0.00200009, 0.00100136, 0.0009985 , 0.00100017]),
'test_R2': array([0.43228581, 0.42249479, 0.45512034, 0.45045543, 0.48604324]),
'test_MAE': array([50.58558101, 45.36980759, 46.13161352, 43.38074686, 47.6912952 ]),
'test_MAPE': array([0.36562587, 0.33692878, 0.41377779, 0.46593697, 0.48479903])},
'poly_3': {'fit_time': array([0.00199652, 0.00099921, 0.00100064, 0.00199962, 0.0019989 ]),
'score_time': array([0.00099802, 0.00100183, 0.00099826, 0.00100255, 0.00100017]),
'test_R2': array([0.45041869, 0.40973122, 0.46174792, 0.45830304, 0.48175925]),
'test_MAE': array([49.38501678, 46.05930554, 45.32848821, 43.26921459, 47.49344929]),
'test_MAPE': array([0.35798695, 0.34411498, 0.40471704, 0.46212987, 0.47836029])},
'tree_3': {'fit_time': array([0.00200033, 0.00099945, 0.0010004 , 0.00099945, 0.00099945]),
'score_time': array([0. , 0.0010016 , 0. , 0. , 0.00100017]),
'test_R2': array([0.43701682, 0.32167862, 0.31959226, 0.37353078, 0.37159155]),
'test_MAE': array([52.33118645, 47.51945251, 48.4535481 , 46.80048981, 51.84335318]),
'test_MAPE': array([0.37821915, 0.3387406 , 0.43151498, 0.49029801, 0.5176862 ])},
'tree_5': {'fit_time': array([0.0010004 , 0.00099802, 0.00099921, 0.00103164, 0.00099468]),
'score_time': array([0. , 0. , 0.00099993, 0.00099254, 0. ]),
'test_R2': array([ 0.37452979, 0.14172192, -0.00584287, 0.31735482, 0.33896895]),
'test_MAE': array([54.42537419, 54.43036146, 60.9255727 , 47.67353586, 51.05476334]),
'test_MAPE': array([0.39391912, 0.40540775, 0.51201066, 0.46162774, 0.49458176])},
'knn_3': {'fit_time': array([0.00099969, 0.00099993, 0. , 0.00101805, 0.00100017]),
'score_time': array([0.00099969, 0. , 0.00097847, 0.00100017, 0.00100017]),
'test_R2': array([0.31750281, 0.23206554, 0.2593057 , 0.47176982, 0.48446127]),
'test_MAE': array([59.25268817, 50.6827957 , 50.85483871, 42.88172043, 47.81967213]),
'test_MAPE': array([0.42963723, 0.3830881 , 0.42509481, 0.40206842, 0.42513757])},
'knn_5': {'fit_time': array([0.00098705, 0.0009923 , 0.0010004 , 0.00099945, 0. ]),
'score_time': array([0.00200582, 0.00099683, 0.0010004 , 0.00100017, 0.00100017]),
'test_R2': array([0.42483855, 0.30885881, 0.34909146, 0.47613233, 0.50973674]),
'test_MAE': array([54.16129032, 50.4483871 , 47.41612903, 41.58064516, 46.61967213]),
'test_MAPE': array([0.38140511, 0.38836045, 0.40980985, 0.41520996, 0.43397499])}}
summary_table = {}
for name, score in result_scores.items():
summary_table[name] = {
'R2_mean': score['test_R2'].mean(),
'R2_std': score['test_R2'].std(),
'MAE_mean': score['test_MAE'].mean(),
'MAE_std': score['test_MAE'].std(),
'MAPE_mean': score['test_MAPE'].mean(),
'MAPE_std': score['test_MAPE'].std()
}
summary_table = pd.DataFrame(summary_table)
summary_table
| poly_2 | poly_3 | tree_3 | tree_5 | knn_3 | knn_5 | |
|---|---|---|---|---|---|---|
| R2_mean | 0.449280 | 0.452392 | 0.364682 | 0.233347 | 0.353021 | 0.413732 |
| R2_std | 0.021878 | 0.023701 | 0.042990 | 0.144051 | 0.105879 | 0.075380 |
| MAE_mean | 46.631809 | 46.307095 | 49.389606 | 53.701922 | 50.298343 | 48.045225 |
| MAE_std | 2.415246 | 2.055667 | 2.269404 | 4.395089 | 5.325183 | 4.180914 |
| MAPE_mean | 0.413414 | 0.409462 | 0.431292 | 0.453509 | 0.413005 | 0.405752 |
| MAPE_std | 0.056548 | 0.053783 | 0.066825 | 0.046989 | 0.017802 | 0.018962 |
models['poly_3'].fit(X_train, y_train)
y_pred = models['poly_3'].predict(X_test)
r2_test = r2_score(y_test, y_pred)
mse_test = mean_absolute_error(y_test, y_pred)
mape_test = mean_absolute_percentage_error(y_test, y_pred)
print('Test Data')
print(f'R2 score: {r2_test}')
print(f'MSE: {mse_test}')
print(f'MAPE: {mape_test}')
Test Data R2 score: 0.43185766610962883 MSE: 46.2130411464925 MAPE: 0.43000619514776434
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
x_surf = np.linspace(df_diabetes.bmi.min(), df_diabetes.bmi.max(), 100)
y_surf = np.linspace(df_diabetes.s5.min(), df_diabetes.s5.max(), 100)
x_surf, y_surf = np.meshgrid(x_surf, y_surf)
# Predictions
int_coff = models['poly_3'].steps[-1][1]
y_pred_new = int_coff.intercept_ + int_coff.coef_[0] * x_surf + int_coff.coef_[1] * y_surf
fig = plt.figure(figsize=(20,10))
### Set figure size
ax = fig.add_subplot(111, projection='3d')
ax.scatter(df_diabetes['bmi'],df_diabetes['s5'],c='red', marker='o', alpha=0.5)
ax.plot_surface(x_surf,y_surf,y_pred_new, color='b', alpha=0.3)
ax.set_xlabel('BMI')
ax.set_ylabel('S5')
ax.set_zlabel('Y')
plt.show()
Model Limitations:
Failure